from math import sqrt
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
sns.set(style = "darkgrid")
CSV = 'StudentsPerformance.csv'
df = pd.read_csv(CSV)
df.shape, df.columns
((1000, 8),
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object'))
def _isnull(df):
_ = []
for c in df.columns:
cnt = df[pd.isnull(df[c])].shape[0]
if cnt != 0:
_.append(c)
return _
_isnull(df)
[]
df.head()
| gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
def mediana(x):
i = len(x)
if i%2==0:
i = int(i/2)
return (x[i-1] + x[i+1])/2
else:
i = int((i/2) + 1)
return x[i]
def media_arit(x):
return sum(x)/len(x)
def med_apar(x, t):
return sum(x[t:-t])/len(x)
def med_pond(x):
# val_freq = dict(zip(set(x), [0 for i in range(len(x))]))
# for v in x:
# val_freq[v] += 1
# return = sum([val * freq for val, freq in val_freq.items()])/len(val_freq)
return sum(x)/len(set(x))
R1 = {
'math': {
'mediana': mediana(df['math score']),
'media_arit': media_arit(df['math score']),
'media_apar': med_apar(df['math score'], 10),
'media_pond': med_pond(df['math score']),
},
'reading': {
'mediana': mediana(df['reading score']),
'media_arit': media_arit(df['reading score']),
'media_apar': med_apar(df['reading score'], 10),
'media_pond': med_pond(df['reading score']),
},
'writing': {
'mediana': mediana(df['writing score']),
'media_arit': media_arit(df['writing score']),
'media_apar': med_apar(df['writing score'], 10),
'media_pond': med_pond(df['writing score']),
},
}
R1
{'math': {'mediana': 85.0,
'media_arit': 66.089,
'media_apar': 64.749,
'media_pond': 815.9135802469136},
'reading': {'mediana': 79.0,
'media_arit': 69.169,
'media_apar': 67.669,
'media_pond': 960.6805555555555},
'writing': {'mediana': 82.0,
'media_arit': 68.054,
'media_apar': 66.611,
'media_pond': 883.8181818181819}}
MAD de cada disciplina.¶def variancia(X):
x_ = media_arit(X)
return sum([(xi - x_)**2 for xi in X])/len(X) # populacional; len(x-1) para variância amostral
def desvio_padrao(X):
return sqrt(variancia(X))
def desvio_medio_absoluto(X):
x_ = media_arit(X)
return sum([abs(xi - x_) for xi in X])/len(X)
def MAD(X):
"""Mediana do desvio mediano absoluto"""
mediana_ = mediana(X)
return mediana([abs(xi - mediana_) for xi in X])
R2 = {
'math': {
'variancia': variancia(df['math score']),
'desvio_padrao': desvio_padrao(df['math score']),
'desvio_medio_absoluto': desvio_medio_absoluto(df['math score']),
'MAD': MAD(df['math score'])
},
'reading': {
'variancia': variancia(df['reading score']),
'desvio_padrao': desvio_padrao(df['reading score']),
'desvio_medio_absoluto': desvio_medio_absoluto(df['reading score']),
'MAD': MAD(df['reading score'])
},
'writing': {
'variancia': variancia(df['writing score']),
'desvio_padrao': desvio_padrao(df['writing score']),
'desvio_medio_absoluto': desvio_medio_absoluto(df['writing score']),
'MAD': MAD(df['writing score'])
},
}
R2
{'math': {'variancia': 229.68907900000048,
'desvio_padrao': 15.155496659628165,
'desvio_medio_absoluto': 12.020246,
'MAD': 9.0},
'reading': {'variancia': 212.9524390000001,
'desvio_padrao': 14.59289001534652,
'desvio_medio_absoluto': 11.778606000000005,
'MAD': 8.0},
'writing': {'variancia': 230.6770839999997,
'desvio_padrao': 15.188057281956757,
'desvio_medio_absoluto': 12.200703999999996,
'MAD': 10.0}}
sns.histplot(df['math score'])
plt.show()
sns.histplot(df['reading score'])
plt.show()
sns.histplot(df['writing score'])
plt.show()
sns.boxplot(data=df['math score'])
plt.show()
sns.boxplot(data=df['reading score'])
plt.show()
sns.boxplot(data=df['writing score'])
plt.show()
Coeficiente de correlação de Pearson:
$r = \frac{\Sigma_{i=1}^n (x_i - \overline{x}) (y_i - \overline{y}) } {(n-1) s_x s_y}$
onde,
def pearson_corr(X, Y):
m_x = media_arit(X)
m_y = media_arit(Y)
dif_x = [(xi - m_x) for xi in X]
dif_y = [(yi - m_y) for yi in Y]
prod = list(zip(dif_x, dif_y))
prod = [p[0] * p[1] for p in prod]
numerador = sum(prod)
desvio_padrao_x = desvio_padrao(X)
desvio_padrao_y = desvio_padrao(Y)
denominador = (len(X) - 1) * desvio_padrao_x * desvio_padrao_y
return numerador/denominador
COLS = ['math score', 'reading score', 'writing score']
d_corr = {}
for col1 in COLS:
d_corr[col1] = {col1: None}
for col2 in COLS:
d_corr[col1][col2] = pearson_corr(df[col1], df[col2])
df_corr = pd.DataFrame(d_corr)
df_corr
| math score | reading score | writing score | |
|---|---|---|---|
| math score | 1.001001 | 0.818398 | 0.803445 |
| reading score | 0.818398 | 1.001001 | 0.955554 |
| writing score | 0.803445 | 0.955554 | 1.001001 |
plt.Figure(figsize=(5,5))
sns.heatmap(df_corr, annot=True, linewidths=1.5, cbar=False)
plt.title('Matriz de correlação')
Text(0.5, 1.0, 'Matriz de correlação')
gender, race/ethnicity, parental level of education, lunch e test preparation course. Para cada um desses parâmetros, realize uma única visualização (colocando os pontos de cada grupo em uma cor). Ao final, discorra se há algum padrão que pode ser detectado nessas análises.¶ETC = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
for etc in ETC:
fig = px.scatter_3d(df, x='math score', y='reading score', z='writing score',
color=etc, title=f'GROUP: {etc.upper()}')
fig.show()